#Amazon Prime Movies and TV link to kaggle: https://www.kaggle.com/datasets/dgoenrique/amazon-prime-movies-and-tv-shows
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ dplyr 1.1.2
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
titles <- read.csv("/Users/leahboger/Desktop/amazon prime/titles.csv")
credits <- read.csv("/Users/leahboger/Desktop/amazon prime/credits.csv")
Q1) Which shows/movies have the largest difference in imdb score vs tmdb score, and what is that difference? A: Zone Drifter has the largest difference in scores (7.7 pt difference). After looking at the amount of votes/ popularity of the top five largest discrepancies, it seems that the larger discrepancies are very unpopular films, so there is most likely more variance in the ratings.
titles %>% mutate(score_diff = abs(imdb_score - tmdb_score)) %>% arrange(-score_diff) %>% select(title, score_diff, imdb_score, imdb_votes, tmdb_score, tmdb_popularity) %>% head(5)
## title score_diff
## 1 Zone Drifter 7.7
## 2 The 1975 'At Their Very Best' Live from Madison Square Garden 7.7
## 3 Lockdown: 2025 7.6
## 4 Warriors from Hell 7.4
## 5 Milgram and the Fastwalkers 7.4
## imdb_score imdb_votes tmdb_score tmdb_popularity
## 1 2.3 145 10 1.553
## 2 9.7 10 2 4.569
## 3 2.4 247 10 2.996
## 4 2.6 41 10 0.990
## 5 2.6 27 10 0.827
Q2) Create a graph to show how the average length of movies changed over time.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(mdsr)
avg_time_per_year <- titles %>% filter(type == "MOVIE") %>% group_by(release_year) %>% summarize(avg_length = mean(runtime))
static_plot <- avg_time_per_year %>% ggplot(aes(x = release_year, y = avg_length)) + geom_line() + xlab("Release Year") + ylab("Average Movie Length")
ggplotly(static_plot)
Q3) Make a graph to explore a possible correlation between runtime and average rating(the average of the imdb and tmdb scores).
#incorrect data point removal
titles %>% filter(runtime == 940) %>% select(title)
## title
## 1 Once Bitten
#after checking amazon.com, this movie is 93 minutes, below this data point will be fixed
titles %>% filter(runtime == 328) %>% select(title)
## title
## 1 Custer's Last Stand
#after checking amazon.com, this movie is 88 minutes, below this data point will be fixed
corrected_outliers <- titles %>% mutate(runtime = case_when(title == "Once Bitten" & runtime == 940 ~ 93, title == "Custer's Last Stand" & runtime == 328 ~ 88, TRUE ~ runtime))
avg_rating <- corrected_outliers %>% mutate(avg_rating = (imdb_score + tmdb_score)/2) %>% select(type, avg_rating, runtime)
static_plot2 <- avg_rating %>% ggplot(aes(x = runtime, y = avg_rating)) + geom_point() + facet_wrap( ~ type) + geom_smooth() + ylab("Average Rating") + xlab("Runtime") + xlim(0,350)
ggplotly(static_plot2)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 2666 rows containing non-finite values (`stat_smooth()`).
Q4) What actor has been in the most TV shows + Movies and which TV shows and movies were they in? What director? A: The actor in the most media is George ‘Gabby’ Hayes, being in 53 movies. The Director in the most media is Joesph Kane, working on 45 movies
#actor
top_actor <- credits %>% filter(role == "ACTOR") %>% group_by(person_id, name) %>% summarize(num_film = length(unique(id))) %>% arrange(-num_film) %>% head(1) %>% pull(person_id)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
top_actor_name <- credits %>% filter(role == "ACTOR") %>% group_by(person_id, name) %>% summarize(num_film = length(unique(id))) %>% arrange(-num_film) %>% head(1) %>% pull(name)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
ls_films_w_actor <- credits %>% filter(person_id == top_actor) %>% group_by(id) %>% summarize(id = id)
filmsact <- ls_films_w_actor %>% left_join(titles, by = c("id")) %>% select(title, type, seasons, imdb_score)
#top actor
top_actor_name
## [1] "George 'Gabby' Hayes"
filmsact
## # A tibble: 53 × 4
## title type seasons imdb_score
## <chr> <chr> <dbl> <dbl>
## 1 The Lucky Texan MOVIE NA 5.6
## 2 Blue Steel MOVIE NA 5.3
## 3 The Star Packer MOVIE NA 5.1
## 4 Riders of Destiny MOVIE NA 5.4
## 5 The Phantom Broadcast MOVIE NA 5.6
## 6 The Return of Casey Jones MOVIE NA 5.1
## 7 Skyway MOVIE NA 5.4
## 8 The Sphinx MOVIE NA 5.6
## 9 'Neath the Arizona Skies MOVIE NA 5.1
## 10 In Old Caliente MOVIE NA 5.6
## # ℹ 43 more rows
nrow(filmsact)
## [1] 53
unique(filmsact$type)
## [1] "MOVIE"
#director
top_director <- credits %>% filter(role == "DIRECTOR") %>% group_by(person_id, name) %>% summarize(num_film = length(unique(id))) %>% arrange(-num_film) %>% head(1) %>% pull(person_id)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
top_director_name <- credits %>% filter(role == "DIRECTOR") %>% group_by(person_id, name) %>% summarize(num_film = length(unique(id))) %>% arrange(-num_film) %>% head(1) %>% pull(name)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
ls_films_w_director <- credits %>% filter(person_id == top_director) %>% group_by(id) %>% summarize(id = id)
filmsdir <- ls_films_w_director %>% left_join(titles, by = c("id")) %>% select(title, type, seasons, imdb_score)
#top director
top_director_name
## [1] "Joseph Kane"
filmsdir
## # A tibble: 45 × 4
## title type seasons imdb_score
## <chr> <chr> <dbl> <dbl>
## 1 Man from Music Mountain MOVIE NA 6.2
## 2 Ghost-Town Gold MOVIE NA 6.1
## 3 Song of Texas MOVIE NA 5.9
## 4 The Old Corral MOVIE NA 5.7
## 5 Idaho MOVIE NA 6
## 6 Frontier Pony Express MOVIE NA 6.1
## 7 In Old Caliente MOVIE NA 5.6
## 8 Wall Street Cowboy MOVIE NA 5.8
## 9 Rough Riders' Round-up MOVIE NA 5.4
## 10 Southward Ho! MOVIE NA 6.2
## # ℹ 35 more rows
nrow(filmsdir)
## [1] 45
unique(filmsdir$type)
## [1] "MOVIE"
Q5) Graph the number of productions a person worked on versus the average imdb score those productions have. Facet by role in the media.
number_films <- credits %>% group_by(person_id) %>% summarize(num_film = length(unique(id)))
person_w_id <- credits %>% group_by(person_id, id, role) %>% summarize(id = id) %>% select(person_id, id, role)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'person_id', 'id', 'role'. You can override
## using the `.groups` argument.
imdb_w_filmnum <- person_w_id %>% left_join(titles, by = c("id")) %>% group_by(person_id, role) %>% summarize(avg_imdb = mean(imdb_score)) %>% left_join(number_films, by = c("person_id"))
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
imdb_w_filmnum %>% ggplot(aes(x= num_film, y = avg_imdb)) + geom_point() + facet_wrap( ~ role) + xlab("Number of Productions Worked On") + ylab("Average IMDB Score")
## Warning: Removed 6307 rows containing missing values (`geom_point()`).